<!DOCTYPE html>
<html lang="en-us">
  <head>

    <!-- Single character-encoding declaration: the HTML Standard permits only one per
         document, and it must sit within the first 1024 bytes. The legacy
         http-equiv="content-type" form declared the same encoding and was redundant. -->
    <meta charset="utf-8">
<title>Stopwords and Phrase Queries | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="stopwords.html" title="Stopwords: Performance Versus Precision">
<link rel="prev" href="common-terms.html" title="Divide and Conquer">
<link rel="next" href="common-grams.html" title="common_grams Token Filter">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <!-- Creates the global dataLayer queue, then the loader pushes the gtm.start event and
         inserts an async gtm.js script element before the first script element on the page.
         The noscript iframe is the fallback when JavaScript is disabled. URLs are pinned to
         https so the snippet also resolves when the page is not served over http(s). -->
    <script>dataLayer = [];</script><noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-58RLH5" title="Google Tag Manager" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-58RLH5');</script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics.
         Loads gtag.js asynchronously for property UA-12395217-16. The inline script reuses
         window.dataLayer if it already exists (otherwise creates it), defines gtag(), which
         pushes its arguments object onto dataLayer, and then queues the 'js' timestamp and
         the 'config' command for the same property. -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <!-- Vendor snippet: on window load, decides via a sampling cookie whether to append the
         Qualtrics Site Intercept script to the body. The empty div after the script is the
         placeholder the vendor marks as required. -->
    <script>
      (function(){var g=function(e,h,f,g){
      // get(a): return the value of the cookie named a, or null when it is not set.
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      // set(a, c): write cookie a=c for path "/", expiring 6048E5 ms (7 days) from now.
      // toUTCString() replaces the deprecated toGMTString(); both produce the same string.
      this.set=function(a,c){var b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toUTCString();document.cookie=a+"="+c+b+"; path=/; "};
      // check(): true when the intercept should load. With rate e == 100 and no cookie it
      // returns true without writing a cookie; otherwise the "mode:rate:count" cookie decides.
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      // go(): when check() passes, append a script element pointing at g to document.body.
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      // start(): defer go() until the window load event (attachEvent is the legacy IE path).
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      // Any error is swallowed deliberately so the feedback widget can never break the page.
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="languages.html">Dealing with Human Language</a></span>
»
<span class="breadcrumb-link"><a href="stopwords.html">Stopwords: Performance Versus Precision</a></span>
»
<span class="breadcrumb-node">Stopwords and Phrase Queries</span>
</div>
<div class="navheader">
<span class="prev">
<a href="common-terms.html">« Divide and Conquer</a>
</span>
<span class="next">
<a href="common-grams.html">common_grams Token Filter »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="stopwords-phrases"></a>Stopwords and Phrase Queries<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/240_Stopwords/50_Phrase_queries.asciidoc">edit</a>
</h2>
</div></div></div>
<p>About 5% of all queries are phrase queries (see <a class="xref" href="phrase-matching.html" title="Phrase Matching">Phrase Matching</a>), but they
often account for the majority of slow queries. Phrase queries can perform
poorly, especially if the phrase includes very common words; a phrase like
“To be, or not to be” could be considered pathological. The reason for this
has to do with the amount of data that is necessary to support proximity
matching.</p>
<p>In <a class="xref" href="pros-cons-stopwords.html" title="Pros and Cons of Stopwords">Pros and Cons of Stopwords</a>, we said that removing stopwords saves only a small
amount of space in the inverted index.  That was only partially true.  A
typical index may contain, among other data, some or all of the following:</p>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
Terms dictionary
</span>
</dt>
<dd>
A sorted list of all terms that appear in the documents in the index,
and a count of the number of documents that contain each term.
</dd>
<dt>
<span class="term">
Postings list
</span>
</dt>
<dd>
A list of which documents contain each term.
</dd>
<dt>
<span class="term">
Term frequency
</span>
</dt>
<dd>
How often each term appears in each document.
</dd>
<dt>
<span class="term">
Positions
</span>
</dt>
<dd>
The position of each term within each document, for phrase and proximity
queries.
</dd>
<dt>
<span class="term">
Offsets
</span>
</dt>
<dd>
The start and end character offsets of each term in each document, for
snippet highlighting. Disabled by default.
</dd>
<dt>
<span class="term">
Norms
</span>
</dt>
<dd>
A factor used to normalize fields of different lengths, to give shorter
fields more weight.
</dd>
</dl>
</div>
<p>Removing stopwords from the index may save a small amount of space in the
<em>terms dictionary</em> and the <em>postings list</em>, but <em>positions</em> and <em>offsets</em> are
another matter. Positions and offsets data can easily double, triple, or
quadruple index size.</p>
<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_positions_data"></a>Positions Data<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/240_Stopwords/50_Phrase_queries.asciidoc">edit</a>
</h3>
</div></div></div>
<p>Positions are enabled on <code class="literal">analyzed</code> string fields by default, so that phrase
queries will work out of the box. The more often that a term appears, the more
space is needed to store its position data. Very common words, by
definition, appear very commonly, and their positions data can run to megabytes
or gigabytes on large collections.</p>
<p>Running a phrase query on a high-frequency word like <code class="literal">the</code> might result in
gigabytes of data being read from disk. That data will be stored in the kernel
filesystem cache to speed up later access, which seems like a good thing, but
it might cause other data to be evicted from the cache, which will slow
subsequent queries.</p>
<p>This is clearly a problem that needs solving.</p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="index-options"></a>Index Options<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/240_Stopwords/50_Phrase_queries.asciidoc">edit</a>
</h3>
</div></div></div>
<p>The first question you should ask yourself is: <em>Do you need phrase or
proximity queries?</em></p>
<p>Often, the answer is no.  For many use cases, such as logging, you need to
know <em>whether</em> a term appears in a document—information that is provided
by the postings list—but not <em>where</em> it appears. Or perhaps you need to use
phrase queries on one or two fields, but you can disable positions data on all
of the other analyzed <code class="literal">string</code> fields.</p>
<p>The <code class="literal">index_options</code> parameter allows you to control what information is stored
in the index for each field.  Valid values are as follows:</p>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">docs</code>
</span>
</dt>
<dd>
Only store which documents contain which terms. This is the default for
<code class="literal">not_analyzed</code> string fields.
</dd>
<dt>
<span class="term">
<code class="literal">freqs</code>
</span>
</dt>
<dd>
Store <code class="literal">docs</code> information, plus how often each term appears in each
document. Term frequencies are needed for complete <a class="xref" href="relevance-intro.html" title="What Is Relevance?">TF/IDF</a>
relevance calculations, but they are not required if you just need to know
whether a document contains a particular term.
</dd>
<dt>
<span class="term">
<code class="literal">positions</code>
</span>
</dt>
<dd>
Store <code class="literal">docs</code> and <code class="literal">freqs</code>, plus the position of each term in each document.
This is the default for <code class="literal">analyzed</code> string fields, but can be disabled if
phrase/proximity matching is not needed.
</dd>
<dt>
<span class="term">
<code class="literal">offsets</code>
</span>
</dt>
<dd>
Store <code class="literal">docs</code>, <code class="literal">freqs</code>, <code class="literal">positions</code>, and the start and end character offsets
of each term in the original string.  This information is used by the
<a href="/guide/en/elasticsearch/reference/5.6/search-request-highlighting.html#_unified_highlighter" class="ulink" target="_top"><code class="literal">unified</code> highlighter</a>
but is disabled by default.
</dd>
</dl>
</div>
<p>You can set <code class="literal">index_options</code> on fields added at index creation time, or when
adding new fields by using the <code class="literal">put-mapping</code> API. This setting can’t be changed
on existing fields:</p>
<div class="pre_wrapper lang-json">
<pre class="programlisting prettyprint lang-json">PUT /my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "title": { <a id="CO175-1"></a><i class="conum" data-value="1"></i>
          "type":          "string"
        },
        "content": { <a id="CO175-2"></a><i class="conum" data-value="2"></i>
          "type":          "string",
          "index_options": "freqs"
        }
      }
    }
  }
}</pre>
</div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO175-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>The <code class="literal">title</code> field uses the default setting of <code class="literal">positions</code>, so
it is suitable for phrase/proximity queries.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO175-2"><i class="conum" data-value="2"></i></a></p>
</td>
<td align="left" valign="top">
<p>The <code class="literal">content</code> field has positions disabled and so cannot be used
for phrase/proximity queries.</p>
</td>
</tr>
</table>
</div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_stopwords"></a>Stopwords<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/240_Stopwords/50_Phrase_queries.asciidoc">edit</a>
</h3>
</div></div></div>
<p>Removing stopwords is one way of reducing the size of the positions data quite
dramatically.   An index with stopwords removed can still be used for phrase
queries because the original positions of the remaining terms are maintained,
as we saw in <a class="xref" href="using-stopwords.html#maintaining-positions" title="Maintaining Positions">Maintaining Positions</a>. But of course, excluding terms from
the index reduces searchability. We wouldn’t be able to differentiate between
the two phrases <em>Man in the moon</em> and <em>Man on the moon</em>.</p>
<p>Fortunately, there is a way to have our cake and eat it: the
<a class="xref" href="common-grams.html" title="common_grams Token Filter"><code class="literal">common_grams</code> token filter</a>.</p>
</div>

</div>
<div class="navfooter">
<span class="prev">
<a href="common-terms.html">« Divide and Conquer</a>
</span>
<span class="next">
<a href="common-grams.html">common_grams Token Filter »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<!-- Page scripts, loaded in order: jQuery, the docs bundle, then an empty global state object. -->
<script src="/guide/static/jquery.js"></script>
<script src="/guide/static/docs.js"></script>
<script>
  // Empty initial state; presumably read by the docs scripts (confirm against docs.js).
  window.initial_state = {};
</script>
  </body>
</html>
